# The package "ezids" (EZ Intro to Data Science) includes a lot of the helper functions we developed for the course. 
# Importing the necessary libraries
library(ezids)
library(dplyr)
library(ggplot2)
library(DT)
library(corrplot)
library(lubridate)
library(tidyr)
library(scales)
library(cluster)
library(knitr)
library(kableExtra)
library(caret)
library(xgboost)
library(pROC)
library(e1071)
library(rpart)
library(rpart.plot)
library(randomForest)

Introduction

Our research focuses on exploring the features of Google Play Store apps that drive app success, guided by the following research question:

“Which are the top 5 app categories, as identified by classification models (logistic regression, SVM, XGBoost, KNN, and random forest), that significantly influenced app success (measured by installs) based on app data from 2010 to 2018, and how accurately can these models predict success trends within this time period?”

Loading the Dataset

Here, we have loaded the dataset ‘Google Play Store Apps’, stored in a CSV file, using `read.csv()`.

# Loading the Dataset
# read.csv() already returns a data.frame, so the previous data.frame()
# wrapper around it was redundant and has been removed.
data_apps <- read.csv("googleplaystore.csv")

Description of the App Dataset Columns

  1. App: The name of the application, represented as a character string.
  2. Category: The main category of the app, such as “ART_AND_DESIGN,” represented as a character string.
  3. Rating: The average user rating of the app, recorded as a numeric value.
  4. Reviews: The total number of user reviews for the app, shown as a character string.
  5. Size: The size of the application, represented as a character string.
  6. Installs: The approximate number of installations for the app, stored as a character string.
  7. Type: Indicates whether the app is free or paid, represented as a character string.
  8. Price: The price of the app, stored as a character string. Free apps are listed as “0,” while paid apps have a dollar amount.
  9. Content.Rating: The target age group for the app, represented as a character string.
  10. Genres: The genre(s) of the app.
  11. Last.Updated: The date of the app’s last update, stored as a character string.
  12. Current.Ver: The current version of the app, represented as a character string.
  13. Android.Ver: The minimum Android version required to run the app, stored as a character string.

Summary of dataset after Data Cleaning

# Dropping columns that are identifiers or otherwise unused for modelling
drop_cols <- c("Android.Ver", "Current.Ver", "Type", "App", "Genres")
data_apps <- data_apps[, setdiff(colnames(data_apps), drop_cols)]

## Price ----

### Conversion of Price to numeric
# Strip the dollar sign, then parse; unparseable entries become NA.
data_apps$Price <- as.numeric(gsub("\\$", "", data_apps$Price))
# Drop rows whose price failed to parse.
# (The old code also compared the now-numeric column to "" — always TRUE for
# non-NA values — and stored two helper vectors that were never used; both
# no-ops have been removed.)
data_apps <- data_apps[!is.na(data_apps$Price), ]

## Size---

#### Replacing Missing values with the mean (Size)
# "Varies with device" carries no numeric size, so mark it as missing.
data_apps$Size[data_apps$Size == "Varies with device"] <- NA #"Varies with Device" to NA
# Drop rows whose Size still contains a '+' — presumably rows whose columns
# were shifted so an install count landed in Size; TODO confirm against the
# raw CSV.
data_apps <- data_apps[!grepl("\\+", data_apps$Size), ]
# Normalize all sizes to megabytes: "k"-suffixed values are divided by 1000,
# "M"-suffixed values are used as-is.
data_apps$Size <- ifelse(grepl("k", data_apps$Size),
                          as.numeric(gsub("k", "", data_apps$Size)) *
0.001,  # Convert "K" to MB
                          as.numeric(gsub("M", "", data_apps$Size))) # Remove "M" for megabytes

# Mean size per app 'Category' (NOTE: the earlier comment said 'Type', but
# the grouping variable actually used here is Category).
mean_size_by_type <- tapply(data_apps$Size, data_apps$Category,
mean, na.rm = TRUE)

# Vectorized imputation (not a loop): replace each NA Size with the mean
# size of that row's Category; non-NA sizes are kept unchanged.
data_apps$Size <- ifelse(is.na(data_apps$Size),  # Check if Size is NA
  round(mean_size_by_type[data_apps$Category], 1), # Replace with the mean size based on the Category
  data_apps$Size)  # Keep the original size if it's not NA


##Installs---

####Remove the '+' sign, Remove the commas, Convert to numeric
#clean installations
# Clean raw install-count strings ("1,000+") into numbers.
# A single gsub pass removes both '+' and ','; gsub and as.numeric are
# vectorized, so this works on whole columns as well as scalars.
clean_installs <- function(Installs) {
  as.numeric(gsub("[+,]", "", Installs))
}

# clean_installs() is vectorized (gsub/as.numeric), so call it directly on
# the column instead of looping with sapply (faster, no names attribute).
data_apps$Installs <- clean_installs(data_apps$Installs)

# Sanity check: flag NaN values in the cleaned numeric columns.
# NOTE(review): nan_rows is computed but never used later — presumably an
# interactive inspection aid; verify before removing.
nan_rows <- sapply(data_apps[, c("Size", "Installs")], function(x) any(is.nan(x)))

## Rating ----
# Impute missing ratings with the overall mean rating.
mean_rating <- mean(data_apps$Rating, na.rm = TRUE)
data_apps$Rating[is.na(data_apps$Rating)] <- mean_rating

# Identify the unique values in the 'Installs' column
# (Installs is already numeric at this point, so these are distinct counts.)
unique_values <- unique(data_apps$Installs)

# Convert an install value to numeric by stripping every non-digit
# character ('+', ',') and parsing the remainder.
# The previous version additionally multiplied by 10^(comma count - 1),
# which inflated comma-separated inputs (e.g. "1,000,000" became 1e7) even
# though the commas are already removed by the gsub; the multiplier is gone.
# (In this script the inputs are already numeric, so results are unchanged.)
convert_to_numeric <- function(x) {
  as.numeric(gsub("[^0-9]", "", x))
}
# Sort unique values based on the custom numeric conversion
# (unique_values is numeric here, so this effectively orders the counts).
sorted_values <- unique_values[order(sapply(unique_values, convert_to_numeric))]

# Reviews ----
# Review counts are read in as character strings; convert to numeric.
data_apps$Reviews <- as.numeric(data_apps$Reviews)
# NOTE(review): the original code repeated the Rating NA-imputation here,
# but ratings were already imputed earlier in the script, so that exact
# duplicate (a no-op) has been removed.

# Content rating ----
# Encode the content-rating categories as integer codes. factor() assigns
# levels alphabetically, so the codes follow alphabetical level order —
# identical to converting to factor first and then to numeric.
data_apps$Content.Rating <- as.numeric(as.factor(data_apps$Content.Rating))
#### Preprocessing for a model
# Categories ----
# One-hot encode 'Category' (the "- 1" drops the intercept so every
# category gets its own column), then shorten the column prefix to "cat".
dummy_mat <- model.matrix(~ Category - 1, data = data_apps)
colnames(dummy_mat) <- gsub("Category", "cat", colnames(dummy_mat))

# Attach the dummy columns and drop the original categorical column.
data_apps <- cbind(data_apps, dummy_mat)
data_apps[["Category"]] <- NULL

# Replace spaces in column names with underscores.
names(data_apps) <- gsub(" ", "_", names(data_apps))


#### Installs ----
# Binarize the install counts at the median: apps at or below the median
# become "Low Installs" (label 0), the rest "High Installs" (label 1).
median_installs <- median(data_apps$Installs, na.rm = TRUE)

data_apps$Installs_Category <- factor(
  ifelse(data_apps$Installs <= median_installs, "Low Installs", "High Installs"),
  levels = c("Low Installs", "High Installs"),
  labels = c(0, 1)
)


# Check the conversion: counts per class (0 = Low Installs, 1 = High Installs)
table(data_apps$Installs_Category)
## 
##    0    1 
## 5890 4950
# Create a histogram for the new categories
ggplot(data_apps, aes(x = Installs_Category)) +
  geom_bar(stat = "count", fill = "skyblue", color = "black") +
  labs(title = "Histogram of Installs Category (Low vs High)",
       x = "Installs Category",
       y = "Count") +
  theme_minimal()

## Last updated----
# Convert the 'last_updated' column to Date type.
# NOTE(review): "%B" month-name parsing is locale-dependent — it yields NA
# unless the session locale uses English month names; confirm the locale.
data_apps$Last.Updated <- as.Date(data_apps$Last.Updated, format = "%B %d, %Y")

# Recode each date as days since the most recent update observed in the
# data (larger value = staler app); NA dates stay NA.
data_apps$Last.Updated <- as.numeric(difftime(max(data_apps$Last.Updated, na.rm = TRUE), 
                                       data_apps$Last.Updated, 
                                       units = "days"))
# Render the post-cleaning summary as a scrollable, styled HTML table
summary_table <- kable(summary(data_apps),
                       caption = "Summary of Google Play Store Apps")
summary_table %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE) %>%
  scroll_box(width = "100%", height = "400px")
Summary of Google Play Store Apps
Rating Reviews Size Installs Price Content.Rating Last.Updated catART_AND_DESIGN catAUTO_AND_VEHICLES catBEAUTY catBOOKS_AND_REFERENCE catBUSINESS catCOMICS catCOMMUNICATION catDATING catEDUCATION catENTERTAINMENT catEVENTS catFAMILY catFINANCE catFOOD_AND_DRINK catGAME catHEALTH_AND_FITNESS catHOUSE_AND_HOME catLIBRARIES_AND_DEMO catLIFESTYLE catMAPS_AND_NAVIGATION catMEDICAL catNEWS_AND_MAGAZINES catPARENTING catPERSONALIZATION catPHOTOGRAPHY catPRODUCTIVITY catSHOPPING catSOCIAL catSPORTS catTOOLS catTRAVEL_AND_LOCAL catVIDEO_PLAYERS catWEATHER Installs_Category
Min. :1.000 Min. : 0 Min. : 0.0085 Min. :0.000e+00 Min. : 0.000 Min. :1.000 Min. : 0.0 Min. :0.000000 Min. :0.000000 Min. :0.000000 Min. :0.00000 Min. :0.00000 Min. :0.000000 Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.000000 Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.00000 Min. :0.000000 Min. :0.000000 Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.000000 Min. :0.00000 Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.00000 Min. :0.000000 0:5890
1st Qu.:4.100 1st Qu.: 38 1st Qu.: 5.9000 1st Qu.:1.000e+03 1st Qu.: 0.000 1st Qu.:2.000 1st Qu.: 19.0 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.000000 1:4950
Median :4.200 Median : 2094 Median : 14.5000 Median :1.000e+05 Median : 0.000 Median :2.000 Median : 76.0 Median :0.000000 Median :0.000000 Median :0.000000 Median :0.00000 Median :0.00000 Median :0.000000 Median :0.0000 Median :0.00000 Median :0.00000 Median :0.00000 Median :0.000000 Median :0.0000 Median :0.00000 Median :0.00000 Median :0.0000 Median :0.00000 Median :0.000000 Median :0.000000 Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000 Median :0.000000 Median :0.00000 Median :0.0000 Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000 Median :0.0000 Median :0.00000 Median :0.000000 NA
Mean :4.192 Mean : 444153 Mean : 21.2119 Mean :1.546e+07 Mean : 1.027 Mean :2.465 Mean : 259.7 Mean :0.005996 Mean :0.007841 Mean :0.004889 Mean :0.02131 Mean :0.04244 Mean :0.005535 Mean :0.0357 Mean :0.02159 Mean :0.01439 Mean :0.01375 Mean :0.005904 Mean :0.1819 Mean :0.03376 Mean :0.01172 Mean :0.1055 Mean :0.03146 Mean :0.008118 Mean :0.007841 Mean :0.03524 Mean :0.01264 Mean :0.04271 Mean :0.02611 Mean :0.005535 Mean :0.03616 Mean :0.0309 Mean :0.03911 Mean :0.02399 Mean :0.02721 Mean :0.03542 Mean :0.07777 Mean :0.0238 Mean :0.01614 Mean :0.007565 NA
3rd Qu.:4.500 3rd Qu.: 54776 3rd Qu.: 28.0000 3rd Qu.:5.000e+06 3rd Qu.: 0.000 3rd Qu.:2.000 3rd Qu.: 322.0 3rd Qu.:0.000000 3rd Qu.:0.000000 3rd Qu.:0.000000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.:0.000000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.000000 NA
Max. :5.000 Max. :78158306 Max. :100.0000 Max. :1.000e+09 Max. :400.000 Max. :6.000 Max. :3001.0 Max. :1.000000 Max. :1.000000 Max. :1.000000 Max. :1.00000 Max. :1.00000 Max. :1.000000 Max. :1.0000 Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.000000 Max. :1.0000 Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :1.00000 Max. :1.000000 Max. :1.000000 Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.000000 Max. :1.00000 Max. :1.0000 Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :1.00000 Max. :1.000000 NA

Logistic Regression

# Ensure the target is a factor (it already is; kept as a safety net) and
# drop the raw install counts so the label is not leaked into the predictors.
data_apps$Installs_Category <- as.factor(data_apps$Installs_Category)
data_apps[["Installs"]] <- NULL

We first start with the basic Logistic model.

# Full logistic regression on all predictors.
# NOTE: the summary reports "1 not defined because of singularities" —
# the complete set of category dummies plus the intercept is collinear
# (dummy-variable trap), so one coefficient (catWEATHER) is returned as NA.
glm_model <- glm(Installs_Category ~ ., data = data_apps, family = binomial)
summary(glm_model)
## 
## Call:
## glm(formula = Installs_Category ~ ., family = binomial, data = data_apps)
## 
## Coefficients: (1 not defined because of singularities)
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             0.3820514  0.6211700   0.615  0.53852    
## Rating                 -0.6803505  0.0847476  -8.028 9.91e-16 ***
## Reviews                 0.0006374  0.0000192  33.202  < 2e-16 ***
## Size                   -0.0005744  0.0030275  -0.190  0.84953    
## Price                  -4.7944227  0.2228852 -21.511  < 2e-16 ***
## Content.Rating         -0.1635661  0.0680993  -2.402  0.01631 *  
## Last.Updated           -0.0006128  0.0001470  -4.169 3.06e-05 ***
## catART_AND_DESIGN      -0.4006310  0.7466491  -0.537  0.59156    
## catAUTO_AND_VEHICLES   -0.3219066  0.6526739  -0.493  0.62186    
## catBEAUTY               0.4362883  0.6833897   0.638  0.52320    
## catBOOKS_AND_REFERENCE -0.1883103  0.6018541  -0.313  0.75437    
## catBUSINESS            -0.7296793  0.5517898  -1.322  0.18604    
## catCOMICS              -0.7802165  0.7833083  -0.996  0.31922    
## catCOMMUNICATION       -0.6947830  0.6337572  -1.096  0.27295    
## catDATING              -0.3163013  0.5833206  -0.542  0.58765    
## catEDUCATION            0.9985535  0.6030813   1.656  0.09777 .  
## catENTERTAINMENT        0.2250374  0.8222913   0.274  0.78434    
## catEVENTS              -1.6690612  0.8893331  -1.877  0.06055 .  
## catFAMILY              -0.3178247  0.5048063  -0.630  0.52896    
## catFINANCE             -1.6174598  0.5894926  -2.744  0.00607 ** 
## catFOOD_AND_DRINK      -0.2992787  0.6568855  -0.456  0.64868    
## catGAME                -0.3539908  0.5335639  -0.663  0.50705    
## catHEALTH_AND_FITNESS  -0.1707530  0.5602267  -0.305  0.76052    
## catHOUSE_AND_HOME       0.7087588  0.6078017   1.166  0.24357    
## catLIBRARIES_AND_DEMO   0.1163682  0.6447283   0.180  0.85677    
## catLIFESTYLE           -0.4210295  0.5434017  -0.775  0.43846    
## catMAPS_AND_NAVIGATION -1.0259721  0.7273633  -1.411  0.15838    
## catMEDICAL             -1.4230374  0.5865690  -2.426  0.01526 *  
## catNEWS_AND_MAGAZINES  -0.9500342  0.6150160  -1.545  0.12241    
## catPARENTING            0.3630993  0.6559063   0.554  0.57986    
## catPERSONALIZATION      0.1339716  0.5607857   0.239  0.81118    
## catPHOTOGRAPHY          0.7174853  0.5586400   1.284  0.19902    
## catPRODUCTIVITY        -0.4885764  0.5612522  -0.871  0.38402    
## catSHOPPING            -0.0205549  0.5949104  -0.035  0.97244    
## catSOCIAL              -0.4726360  0.6194859  -0.763  0.44549    
## catSPORTS              -0.9393387  0.5765341  -1.629  0.10325    
## catTOOLS               -0.2909637  0.5195893  -0.560  0.57549    
## catTRAVEL_AND_LOCAL    -0.0263625  0.5833597  -0.045  0.96396    
## catVIDEO_PLAYERS        0.2862331  0.5872173   0.487  0.62595    
## catWEATHER                     NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 14945.8  on 10839  degrees of freedom
## Residual deviance:  2803.6  on 10801  degrees of freedom
## AIC: 2881.6
## 
## Number of Fisher Scoring iterations: 15

Stepwise Selection for GLM

Next, stepwise selection is performed starting from the logistic model above. The resulting model has an AIC of 2850, indicating that it performs better than the full model (which has an AIC of 2881).

# Both-direction stepwise selection by AIC, starting from the full model
# (trace = 0 suppresses the per-step console output).
stepwise_model <- stats::step(glm_model, direction = "both", trace = 0)

summary(stepwise_model)
## 
## Call:
## glm(formula = Installs_Category ~ Rating + Reviews + Price + 
##     Content.Rating + Last.Updated + catBUSINESS + catEDUCATION + 
##     catEVENTS + catFINANCE + catHOUSE_AND_HOME + catMAPS_AND_NAVIGATION + 
##     catMEDICAL + catNEWS_AND_MAGAZINES + catPHOTOGRAPHY + catSPORTS + 
##     catVIDEO_PLAYERS + catPERSONALIZATION + catBEAUTY + catPARENTING, 
##     family = binomial, data = data_apps)
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             9.447e-02  3.582e-01   0.264 0.792003    
## Rating                 -6.786e-01  8.303e-02  -8.172 3.03e-16 ***
## Reviews                 6.376e-04  1.882e-05  33.874  < 2e-16 ***
## Price                  -4.798e+00  2.226e-01 -21.553  < 2e-16 ***
## Content.Rating         -1.782e-01  5.885e-02  -3.028 0.002464 ** 
## Last.Updated           -6.181e-04  1.406e-04  -4.395 1.11e-05 ***
## catBUSINESS            -4.264e-01  2.615e-01  -1.630 0.103014    
## catEDUCATION            1.301e+00  3.590e-01   3.624 0.000290 ***
## catEVENTS              -1.368e+00  7.478e-01  -1.829 0.067352 .  
## catFINANCE             -1.319e+00  3.359e-01  -3.927 8.62e-05 ***
## catHOUSE_AND_HOME       1.013e+00  3.660e-01   2.767 0.005656 ** 
## catMAPS_AND_NAVIGATION -7.234e-01  5.415e-01  -1.336 0.181578    
## catMEDICAL             -1.124e+00  3.281e-01  -3.427 0.000610 ***
## catNEWS_AND_MAGAZINES  -6.422e-01  3.757e-01  -1.710 0.087354 .  
## catPHOTOGRAPHY          1.024e+00  2.778e-01   3.688 0.000226 ***
## catSPORTS              -6.410e-01  3.113e-01  -2.059 0.039503 *  
## catVIDEO_PLAYERS        5.932e-01  3.292e-01   1.802 0.071563 .  
## catPERSONALIZATION      4.459e-01  2.820e-01   1.581 0.113860    
## catBEAUTY               7.416e-01  4.795e-01   1.547 0.121909    
## catPARENTING            6.601e-01  4.392e-01   1.503 0.132892    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 14945.8  on 10839  degrees of freedom
## Residual deviance:  2810.1  on 10820  degrees of freedom
## AIC: 2850.1
## 
## Number of Fisher Scoring iterations: 16
# Variable importance for the stepwise GLM; for glm objects caret reports
# the absolute z statistic per term (values match the summary's z column).
glm_varImp <- varImp(stepwise_model, scale = FALSE)
print(glm_varImp)
##                          Overall
## Rating                  8.172170
## Reviews                33.874116
## Price                  21.552835
## Content.Rating          3.027785
## Last.Updated            4.395380
## catBUSINESS             1.630414
## catEDUCATION            3.624470
## catEVENTS               1.829315
## catFINANCE              3.926529
## catHOUSE_AND_HOME       2.767071
## catMAPS_AND_NAVIGATION  1.335912
## catMEDICAL              3.427031
## catNEWS_AND_MAGAZINES   1.709524
## catPHOTOGRAPHY          3.687768
## catSPORTS               2.058906
## catVIDEO_PLAYERS        1.801890
## catPERSONALIZATION      1.581080
## catBEAUTY               1.546812
## catPARENTING            1.502794
#install.packages('vip')
library(vip)


# Plot the top 10 important features with the custom color
vip_plot<-vip(stepwise_model, num_features = 10, bar = TRUE) 

# NOTE(review): adding geom_bar() on top of the vip plot draws a second bar
# layer over the bars vip already produced; it recolors the chart but
# duplicates the layer — verify the rendered figure looks as intended.
vip_plot +
  # Set the custom color for bars
  geom_bar(stat = "identity", fill = "#365b6d")  +
  # Reversing the order of features for better visual flow
  coord_flip() +
  # Improving labels and text size
  theme_minimal() +
  theme(
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14, face = "bold"),
    plot.title = element_text(size = 16, face = "bold"),
    legend.position = "none"  # Hides the legend if not needed
  ) +
  ggtitle("Top 10 Important Features - GLM Model")

# In-sample predicted probabilities from the stepwise model
glm_probs <- predict(stepwise_model, data_apps, type = "response")

# Threshold at 0.5 to obtain hard 0/1 class labels
glm_pred <- as.integer(glm_probs > 0.5)

# Confusion matrix of predicted vs. actual classes
test_cm <- confusionMatrix(factor(glm_pred), factor(data_apps$Installs_Category))
print(test_cm)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 5763  370
##          1  127 4580
##                                        
##                Accuracy : 0.9542       
##                  95% CI : (0.95, 0.958)
##     No Information Rate : 0.5434       
##     P-Value [Acc > NIR] : < 2.2e-16    
##                                        
##                   Kappa : 0.9072       
##                                        
##  Mcnemar's Test P-Value : < 2.2e-16    
##                                        
##             Sensitivity : 0.9784       
##             Specificity : 0.9253       
##          Pos Pred Value : 0.9397       
##          Neg Pred Value : 0.9730       
##              Prevalence : 0.5434       
##          Detection Rate : 0.5316       
##    Detection Prevalence : 0.5658       
##       Balanced Accuracy : 0.9518       
##                                        
##        'Positive' Class : 0            
## 
# Build a plotting data frame from the confusion matrix counts.
# (The original as.table()/as.data.frame() pair was dead code — its cm_df
# was immediately overwritten by the line below, so it has been removed.)
cm_df <- as.data.frame(test_cm$table)
cm_df$Percentage <- cm_df$Freq / sum(cm_df$Freq) * 100

# Heatmap of the confusion matrix, each cell annotated with its share of
# all predictions and its raw count
cm_heatmap <- ggplot(cm_df, aes(x = Prediction, y = Reference)) +
  geom_tile(aes(fill = Percentage), color = "white") +
  geom_text(
    aes(label = sprintf("%.1f%%\n(n=%d)", Percentage, Freq)),
    size = 4, color = "black"
  ) +
  scale_fill_gradient2(
    low = "white", high = "#6c9286",
    midpoint = min(cm_df$Percentage)
  ) +
  labs(
    title = "Confusion Matrix Heatmap",
    subtitle = sprintf("Overall Accuracy: %.1f%%",
                       test_cm$overall["Accuracy"] * 100)
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    axis.text = element_text(size = 10),
    legend.title = element_text(size = 10)
  )
cm_heatmap

# Precision/recall/F1 for the positive class, which caret set to 0
# ("Low Installs") — see the confusion-matrix output above.
# NOTE(review): these are in-sample metrics — the GLM was evaluated on the
# same data it was fit on (no held-out split in this section).
precision <- test_cm$byClass["Pos Pred Value"]  # Precision
recall <- test_cm$byClass["Sensitivity"]        # Recall
f1_score <- test_cm$byClass["F1"]               # F1 Score

cat("Precision: ", precision, "\n")
## Precision:  0.9396706
cat("Recall: ", recall, "\n")
## Recall:  0.978438
cat("F1 Score: ", f1_score, "\n")
## F1 Score:  0.9586626
# Predict probabilities (in-sample, same data the model was fit on)
glm_probs <- predict(stepwise_model, data_apps, type = "response")

# Generate ROC curve: response = true class, predictor = fitted probability
roc_curve <- roc(data_apps$Installs_Category, glm_probs)


# Plot ROC curve
plot(roc_curve, main = "ROC Curve")

# AUC (Area Under the Curve)
auc_value <- auc(roc_curve)
cat("AUC: ", auc_value, "\n")
## AUC:  0.9922682

KNN

Splitting the Data into Train and Test

# Split the data into features and target.
# Feature set: the numeric predictors plus every one-hot "cat*" column.
features <- c("Rating", "Reviews", "Size", "Price", "Content.Rating", "Last.Updated",
              grep("^cat", names(data_apps), value = TRUE))
X <- data_apps[, features]
y <- data_apps$Installs_Category

# Split the data into training and testing sets (70/30, stratified on y)
set.seed(123)
train_index <- createDataPartition(y, p = 0.7, list = FALSE)
X_train <- X[train_index, ]
X_test <- X[-train_index, ]
y_train <- y[train_index]
y_test <- y[-train_index]

# Scale the features: centering/scaling parameters are learned on the
# training set only and applied to both splits (avoids test-set leakage).
preprocess_params <- preProcess(X_train, method = c("center", "scale"))
X_train_scaled <- predict(preprocess_params, X_train)
X_test_scaled <- predict(preprocess_params, X_test)

Find optimal K using Cross validation

# Train KNN with 5-fold CV for each candidate k (odd values only, to avoid
# tie votes between the two classes).
k_values <- seq(1, 20, by = 2)

# The control object is identical for every k, so build it once outside the
# loop instead of recreating it per iteration.
train_control <- trainControl(method = "cv", number = 5)

cv_results <- lapply(k_values, function(k) {
  knn_model <- train(
    x = X_train_scaled,
    y = y_train,
    method = "knn",
    tuneGrid = data.frame(k = k),
    trControl = train_control
  )
  # One CV-accuracy value per candidate k
  knn_model$results$Accuracy
})

# Visualize cross-validation accuracy across the candidate k values
k_accuracy_df <- data.frame(k = k_values, accuracy = unlist(cv_results))
ggplot(k_accuracy_df, aes(x = k, y = accuracy)) +
  geom_line(color = "blue") +
  geom_point(color = "red") +
  labs(
    title = "K Values vs Cross-Validation Accuracy",
    x = "Number of Neighbors (k)",
    y = "Accuracy"
  ) +
  theme_minimal()

As we can see, the optimal k is selected by cross-validation with accuracy as the metric; the highest cross-validation accuracy achieved is about 78%.

Evaluation metrics for test data using optimal k

# which.max() requires an atomic vector, but cv_results is a list —
# flatten it first so the best k is selected reliably.
optimal_k <- k_values[which.max(unlist(cv_results))]
# Fit the final KNN classifier at the chosen k
final_knn <- knn3(X_train_scaled, y_train, k = optimal_k)

# Predicted class probabilities on the held-out test set
y_pred_prob <- predict(final_knn, X_test_scaled, type = "prob")
# Threshold the class-1 probability at 0.5 for hard labels
y_pred <- factor(ifelse(y_pred_prob[, 2] > 0.5, 1, 0), levels = c(0, 1))

# 0/1 numeric version of the test labels (factor -> character -> numeric
# preserves the label values rather than the level codes)
y_test_numeric <- as.numeric(as.character(y_test))

# Calculate performance metrics
conf_matrix <- confusionMatrix(y_pred, y_test)
# y_test_numeric is already numeric — the redundant as.numeric() is gone
roc_curve <- roc(y_test_numeric, y_pred_prob[, 2])
auc_score <- auc(roc_curve)

# Create performance summary table
# (caret reports Precision/Recall/F1 for the positive class, which is the
# first factor level: 0 = "Low Installs".)
performance_summary <- data.frame(
  Metric = c("Accuracy", "Precision", "Recall", "F1 Score", "AUC"),
  Value = c(
    conf_matrix$overall["Accuracy"],
    conf_matrix$byClass["Precision"],
    conf_matrix$byClass["Recall"],
    conf_matrix$byClass["F1"],
    auc_score
  )
)
# Display performance summary using kable
kable(performance_summary, caption = "KNN Model Performance Summary") %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
KNN Model Performance Summary
Metric Value
Accuracy Accuracy 0.7610701
Precision Precision 0.7857968
Recall Recall 0.7702320
F1 F1 Score 0.7779366
AUC 0.8324551
# Confusion-matrix heat map: each cell shows its share of all test cases
conf_matrix_df <- as.data.frame(conf_matrix$table)
conf_matrix_df$Percentage <- conf_matrix_df$Freq / sum(conf_matrix_df$Freq) * 100

knn_cm_plot <- ggplot(conf_matrix_df, aes(x = Prediction, y = Reference)) +
  geom_tile(aes(fill = Percentage), color = "white") +
  geom_text(
    aes(label = sprintf("%.1f%%\n(n=%d)", Percentage, Freq)),
    size = 4, color = "black"
  ) +
  scale_fill_gradient2(
    low = "white", high = "#4A90E2",
    midpoint = min(conf_matrix_df$Percentage)
  ) +
  labs(
    title = "Confusion Matrix Heatmap",
    subtitle = sprintf("Overall Accuracy: %.1f%%",
                       conf_matrix$overall["Accuracy"] * 100)
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    axis.text = element_text(size = 10),
    legend.title = element_text(size = 10)
  )
knn_cm_plot

# ROC curve for the KNN model, with a dashed chance-level reference line
plot(roc_curve, main = "ROC Curve for KNN Model", col = "blue", lwd = 2)
abline(a = 0, b = 1, lty = 2, col = "gray")
auc_label <- sprintf("AUC = %.3f", auc_score)
legend("bottomright", legend = auc_label)

The AUC for the model is 0.83, which is good; however, performance metrics such as accuracy and precision fall short of the anticipated 80% threshold.

Feature importance

# Permutation importance: shuffle one feature in the (scaled) test set and
# measure how much the model's AUC drops relative to the unpermuted baseline.
calc_importance <- function(feature) {
  X_test_permuted <- X_test_scaled
  # NOTE(review): sample() is not seeded here, so importance scores vary
  # between runs — consider set.seed() for reproducibility.
  X_test_permuted[,feature] <- sample(X_test_scaled[,feature])
  pred_permuted <- predict(final_knn, X_test_permuted, type = "prob")
  # as.numeric(factor) yields 1/2 level codes; acceptable as a two-level
  # ROC response since only the grouping matters.
  roc_permuted <- roc(as.numeric(y_test), pred_permuted[,2])
  return(auc_score - auc(roc_permuted))
}

# One AUC-drop score per feature
importance_scores <- sapply(features, calc_importance)
importance_df <- data.frame(
  Feature = features,
  Importance = importance_scores
)
# Sort features by descending importance (largest AUC drop first)
importance_df <- importance_df[order(-importance_df$Importance),]

# Plot feature importance
# Plot up to the top 39 features by importance.
# head() is safe when fewer rows exist — the previous hard-coded
# importance_df[1:39, ] would produce NA rows if the frame had < 39 rows.
top_features <- head(importance_df, 39)
ggplot(top_features, aes(x = reorder(Feature, Importance), y = Importance)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Feature Importance (KNN Model)",
       x = "Feature",
       y = "Importance Score") +
  theme_minimal()

The top 5 features contributing to the model are Size, Rating, Last updated, Content Rating and Reviews.

#————————————————————————-

Decision Tree

Splitting the data into train and test split

# Defensive copy without the raw installs column (it was already dropped
# before the GLM fit, so this is usually a no-op).
data <- data_apps[, setdiff(colnames(data_apps), "Installs"), drop = FALSE]

# Split the data into training and testing sets (80/20, stratified)
set.seed(123)  # Ensure reproducibility
trainIndex <- createDataPartition(data$Installs_Category, p = 0.8, list = FALSE)
trainData <- data[trainIndex, ]
testData <- data[-trainIndex, ]

# Fit a classification tree for the binary installs label
set.seed(42)
tree_model <- rpart(
  Installs_Category ~ .,
  data = trainData,
  method = "class"
)

# Visualize the fitted classification tree
rpart.plot(tree_model, main = "Decision Tree for Predicting Installs Category")

# Hard class predictions for both splits
train_predictions <- predict(tree_model, trainData, type = "class")
test_predictions <- predict(tree_model, testData, type = "class")



# Calculate accuracy: fraction of rows whose predicted class matches the
# true class. Train and test accuracy are close, which is consistent with
# little overfitting for this tree.
train_accuracy <- sum(train_predictions == trainData$Installs_Category) / nrow(trainData)
test_accuracy <- sum(test_predictions == testData$Installs_Category) / nrow(testData)

# Print accuracy results
cat("Training Accuracy: ", train_accuracy, "\n")
## Training Accuracy:  0.9519142
cat("Test Accuracy: ", test_accuracy, "\n")
## Test Accuracy:  0.9488007

Why shift to Random Forest? High Dimensionality: With 41 variables, random forest handles many features better and can identify the most important ones.
Feature Importance: Random forest provides a ranking of feature importance, helping us understand which variables influence the Installs_Category.
Accuracy: Random forest generally has better predictive accuracy for larger and more complex datasets.

Random Forest

In this analysis, we employ a Random Forest model to predict the number of installs based on the top 5 app categories. The Random Forest algorithm is a robust ensemble learning method that builds multiple decision trees and combines their predictions to improve accuracy and reduce overfitting.

Building the Random Forest Classifier

# Train the random forest model
set.seed(123)
rf_model <- randomForest(Installs_Category ~ .,
                         data = trainData,
                         ntree = 500,       # Number of trees
                         mtry = 4,  # Number of predictors sampled at each split
                         importance = TRUE) # Enable importance calculation

# Print the model summary (includes the OOB error estimate and its
# per-class confusion matrix)
print(rf_model)
## 
## Call:
##  randomForest(formula = Installs_Category ~ ., data = trainData,      ntree = 500, mtry = 4, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 4.35%
## Confusion matrix:
##      0    1 class.error
## 0 4547  165  0.03501698
## 1  212 3748  0.05353535

Plotting the Random Forest Result

# Plot error rate vs. number of trees. plot.randomForest draws one line per
# column of err.rate: OOB first (black), then one line per class level.
plot(rf_model, main = "Random Forest Model Performance")

# Legend matched to the err.rate columns: OOB, class "0", class "1".
# (The original labels said "Class 1/2 Error" although the classes are
# coded 0/1.)
legend("topright",
       legend = c("OOB Error", "Class 0 Error", "Class 1 Error"),
       col = c("black", "red", "green"),
       lty = 1,
       cex = 0.8)

Testing the accuracy

# Predictions on the training set
train_predictions <- predict(rf_model, trainData)

# Predictions on the testing set
test_predictions <- predict(rf_model, testData)


# Confusion Matrix for Training Data (in-sample fit of the forest)
 train_cm <- confusionMatrix(train_predictions, trainData$Installs_Category)
 print(train_cm)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 4617  122
##          1   95 3838
##                                           
##                Accuracy : 0.975           
##                  95% CI : (0.9715, 0.9782)
##     No Information Rate : 0.5434          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.9495          
##                                           
##  Mcnemar's Test P-Value : 0.07756         
##                                           
##             Sensitivity : 0.9798          
##             Specificity : 0.9692          
##          Pos Pred Value : 0.9743          
##          Neg Pred Value : 0.9758          
##              Prevalence : 0.5434          
##          Detection Rate : 0.5324          
##    Detection Prevalence : 0.5465          
##       Balanced Accuracy : 0.9745          
##                                           
##        'Positive' Class : 0               
## 
# Confusion Matrix for Testing Data (held-out 20% split)
# NOTE(review): this reuses — and overwrites — the name test_cm from the
# GLM section; consider a distinct name such as rf_test_cm.
test_cm <- confusionMatrix(test_predictions, testData$Installs_Category)
print(test_cm)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1132   53
##          1   46  937
##                                           
##                Accuracy : 0.9543          
##                  95% CI : (0.9447, 0.9627)
##     No Information Rate : 0.5434          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9079          
##                                           
##  Mcnemar's Test P-Value : 0.5465          
##                                           
##             Sensitivity : 0.9610          
##             Specificity : 0.9465          
##          Pos Pred Value : 0.9553          
##          Neg Pred Value : 0.9532          
##              Prevalence : 0.5434          
##          Detection Rate : 0.5221          
##    Detection Prevalence : 0.5466          
##       Balanced Accuracy : 0.9537          
##                                           
##        'Positive' Class : 0               
## 

Comparing accuracy across the training, testing, and out-of-bag (OOB) datasets

# Performance on Training Data
# (re-predicting on the fitted data; accuracy here is an optimistic bound)
train_predictions <- predict(rf_model, newdata = trainData)
train_conf_matrix <- table(train_predictions, trainData$Installs_Category)
train_accuracy <- sum(diag(train_conf_matrix)) / sum(train_conf_matrix)
cat("Training Accuracy: ", train_accuracy, "\n")
## Training Accuracy:  0.9749769
# Performance on Testing Data
# Accuracy = trace of the confusion matrix over the total count
test_predictions <- predict(rf_model, newdata = testData)
test_conf_matrix <- table(test_predictions, testData$Installs_Category)
test_accuracy <- sum(diag(test_conf_matrix)) / sum(test_conf_matrix)
cat("Testing Accuracy: ", test_accuracy, "\n")
## Testing Accuracy:  0.9543358
# OOB Error from the model.
# err.rate has one row per tree, so the last row is the final OOB error.
# Using nrow() instead of the hard-coded index 500 keeps this correct if
# the forest is retrained with a different ntree.
oob_error <- rf_model$err.rate[nrow(rf_model$err.rate), "OOB"]
oob_accuracy <- 1 - oob_error
cat("OOB Accuracy: ", oob_accuracy, "\n")
## OOB Accuracy:  0.9565268
# Compare Results: collect the three accuracy estimates into one table
dataset_names <- c("Training", "Testing", "OOB")
dataset_accuracies <- c(train_accuracy, test_accuracy, oob_accuracy)
comparison <- data.frame(Dataset = dataset_names, Accuracy = dataset_accuracies)
print(comparison)
##    Dataset  Accuracy
## 1 Training 0.9749769
## 2  Testing 0.9543358
## 3      OOB 0.9565268

AUC/ROC

# Install pROC if missing, then load it.
# requireNamespace() is the recommended installation check; require() is an
# anti-pattern for loading because it returns FALSE instead of erroring.
if (!requireNamespace("pROC", quietly = TRUE)) install.packages("pROC", dependencies = TRUE)
library(pROC)

# Get predicted probabilities for the positive class (class '1')
rf_prob <- predict(rf_model, testData, type = "prob")[, 2] # Probabilities for class '1'

# Compute ROC curve and AUC from the true labels and predicted probabilities
roc_curve <- roc(testData$Installs_Category, rf_prob)

# Plot the ROC curve
plot(roc_curve, col = "blue", lwd = 2, main = "ROC Curve for Random Forest")
abline(a = 0, b = 1, lty = 2, col = "gray") # Diagonal reference line (chance level)

# Display the AUC value
auc_value <- auc(roc_curve)
cat("AUC:", auc_value, "\n")
## AUC: 0.9881922

Feature Importance Values

# Variable importance table: per-class and overall MeanDecreaseAccuracy,
# plus MeanDecreaseGini, for every predictor in the forest
importance(rf_model)
##                                 0            1 MeanDecreaseAccuracy
## Rating                 15.1142447  19.94723199           22.5997876
## Reviews                90.0067853 100.26035812          101.9665752
## Size                   11.8994820  17.27289181           20.2294097
## Price                  37.1396369  41.59484288           45.7099624
## Content.Rating         10.1721956  10.05626857           13.8741454
## Last.Updated           17.5212482  16.76819395           24.0306055
## catART_AND_DESIGN      -1.7119601  -0.45450740           -1.6004086
## catAUTO_AND_VEHICLES    2.1287644   7.30700185            7.0864851
## catBEAUTY              -1.6937161   4.43340758            2.3915006
## catBOOKS_AND_REFERENCE -1.4109002   2.40811737            0.9254431
## catBUSINESS            -2.9786640  11.46650484           10.2307574
## catCOMICS               2.6287264   2.40038960            3.7801295
## catCOMMUNICATION        0.3875751  -0.94118112           -0.3435919
## catDATING               4.8477990  -0.39074922            3.4658472
## catEDUCATION           12.0764045   7.85370158           12.9867373
## catENTERTAINMENT       14.4382698   1.32379128           14.7495594
## catEVENTS               6.6402383  10.65844758           11.2020113
## catFAMILY              -5.9215812   6.69886354            2.4785553
## catFINANCE              0.8756648   5.90079226            5.1381823
## catFOOD_AND_DRINK      -4.4615018   0.43503127           -3.1082568
## catGAME                12.3678276   0.23835409           12.5083958
## catHEALTH_AND_FITNESS   1.4680166   3.83750346            3.7565491
## catHOUSE_AND_HOME      -1.2091642   2.71761141            1.1205213
## catLIBRARIES_AND_DEMO   1.0719952   7.41387595            7.0258040
## catLIFESTYLE           -1.7178046   6.10759331            5.2129387
## catMAPS_AND_NAVIGATION -3.7395597   1.04149969           -1.2763125
## catMEDICAL              6.6464141  20.04470310           20.3154344
## catNEWS_AND_MAGAZINES   1.5084049   6.83054946            6.2625470
## catPARENTING           -1.5158336   4.68584942            2.3270685
## catPERSONALIZATION     -0.1349033   3.41111809            2.6745840
## catPHOTOGRAPHY         14.3055236   5.00877316           15.4154186
## catPRODUCTIVITY        -4.6589723   1.97159358           -1.0439910
## catSHOPPING             7.9015188   1.67789040            8.5681436
## catSOCIAL               3.0757349  -0.93364078            2.3564318
## catSPORTS               1.6490382   0.37976680            1.3831812
## catTOOLS               -3.8679728   0.19000491           -1.6527781
## catTRAVEL_AND_LOCAL    -2.8476202   0.61812514           -1.7685359
## catVIDEO_PLAYERS       -0.7382980  -0.01421662           -0.5999692
## catWEATHER              3.2343068  -2.54911142            1.2265776
##                        MeanDecreaseGini
## Rating                       258.205076
## Reviews                     2341.076881
## Size                         202.836810
## Price                        105.230783
## Content.Rating                32.068778
## Last.Updated                 276.565693
## catART_AND_DESIGN              2.632787
## catAUTO_AND_VEHICLES           4.148712
## catBEAUTY                      2.755840
## catBOOKS_AND_REFERENCE         4.978577
## catBUSINESS                   12.245818
## catCOMICS                      2.994492
## catCOMMUNICATION               4.271465
## catDATING                      5.816032
## catEDUCATION                   9.973986
## catENTERTAINMENT              13.860234
## catEVENTS                      4.870598
## catFAMILY                     13.905954
## catFINANCE                     5.811543
## catFOOD_AND_DRINK              2.055011
## catGAME                       40.202925
## catHEALTH_AND_FITNESS          5.479474
## catHOUSE_AND_HOME              2.958173
## catLIBRARIES_AND_DEMO          4.038455
## catLIFESTYLE                   6.265143
## catMAPS_AND_NAVIGATION         1.791088
## catMEDICAL                    43.762990
## catNEWS_AND_MAGAZINES          4.494912
## catPARENTING                   2.847382
## catPERSONALIZATION             4.814103
## catPHOTOGRAPHY                18.729702
## catPRODUCTIVITY                4.511257
## catSHOPPING                    7.480957
## catSOCIAL                      4.087498
## catSPORTS                      3.295354
## catTOOLS                       7.597817
## catTRAVEL_AND_LOCAL            2.957056
## catVIDEO_PLAYERS               3.473253
## catWEATHER                     3.001699
# Plot variable importance (randomForest's built-in dotcharts)
varImpPlot(rf_model)

#### Visualization for Feature Importance

# Extract importance values into a data frame for ggplot
importance_values <- importance(rf_model)
importance_df <- data.frame(
  Feature = rownames(importance_values),
  MeanDecreaseAccuracy = importance_values[, "MeanDecreaseAccuracy"],
  MeanDecreaseGini = importance_values[, "MeanDecreaseGini"]
)
# Plot Mean Decrease in Accuracy.
# geom_col() is the idiomatic shorthand for geom_bar(stat = "identity").
accuracy_plot <- ggplot(importance_df, aes(x = reorder(Feature, MeanDecreaseAccuracy), y = MeanDecreaseAccuracy)) +
  geom_col(fill = "skyblue") +
  coord_flip() +
  labs(
    title = "Feature Importance (Mean Decrease in Accuracy)",
    x = "Features",
    y = "Importance"
  ) +
  theme_minimal() +
  theme(text = element_text(size = 12), axis.text.y = element_text(size = 10))

# Plot the accuracy plot
print(accuracy_plot)

# Save the plot with larger dimensions
# ggsave("feature_importance_accuracy_large.png", plot = accuracy_plot, width = 12, height = 10, dpi = 300)

# Plot Mean Decrease in Gini
gini_plot <- ggplot(importance_df, aes(x = reorder(Feature, MeanDecreaseGini), y = MeanDecreaseGini)) +
  geom_col(fill = "lightgreen") +
  coord_flip() +
  labs(
    title = "Feature Importance (Mean Decrease in Gini)",
    x = "Features",
    y = "Importance"
  ) +
  theme_minimal() +
  theme(text = element_text(size = 12), axis.text.y = element_text(size = 10))

# Plot the gini index plot
print(gini_plot)

# Save the plot with larger dimensions
# ggsave("feature_importance_gini_large.png", plot = gini_plot, width = 12, height = 10, dpi = 300)

Gradient Boosting

Data Splitting

# Separate features (X) and target (y)
#X <- data_final %>% select(-Installs, -Success)
X <- data_apps %>% select(-Installs_Category)  # Exclude the target variable
y <- data_apps$Installs_Category             # Extract the target variable


# Class balance of the binary target (printed below)
table(y)
## y
##    0    1 
## 5890 4950
#Split into training and testing sets (stratified 70/30 via caret)
set.seed(123)
train_index <- createDataPartition(y, p = 0.7, list = FALSE)

# Define X_train, X_test, y_train, y_test.
# XGBoost requires an all-numeric matrix, so every column is coerced here.
X_train <- X[train_index, ] %>% mutate(across(everything(), as.numeric))
X_test <- X[-train_index, ] %>% mutate(across(everything(), as.numeric))
y_train <- as.numeric(as.character(y[train_index]))  # Convert to numeric
y_test <- as.numeric(as.character(y[-train_index]))  # Convert to numeric
# Convert data to matrix for XGBoost
dtrain <- xgb.DMatrix(data = as.matrix(X_train), label = y_train)
dtest <- xgb.DMatrix(data = as.matrix(X_test), label = y_test)

# Recode the target as a 0/1 factor for the later caret/confusionMatrix steps
data_apps$Installs_Category <- factor(data_apps$Installs_Category, levels = c(0, 1))

Train Gradient Boosting Model

# 4. Define hyperparameters and train XGBoost ---------------------------

params <- list(
  objective = "binary:logistic",  # Binary classification
  eval_metric = "logloss",
  max_depth = 6,
  eta = 0.1,
  subsample = 0.8,
  colsample_bytree = 0.8
)

# Train the model, monitoring logloss on both train and test sets;
# stop early if the test metric fails to improve for 10 consecutive rounds
set.seed(42)
xgb_model <- xgb.train(
  params = params,
  data = dtrain,
  nrounds = 100,
  watchlist = list(train = dtrain, test = dtest),
  early_stopping_rounds = 10,
  verbose = 1
)
## [1]  train-logloss:0.609051  test-logloss:0.610845 
## Multiple eval metrics are present. Will use test_logloss for early stopping.
## Will train until test_logloss hasn't improved in 10 rounds.
## 
## [2]  train-logloss:0.540069  test-logloss:0.543209 
## [3]  train-logloss:0.482265  test-logloss:0.486837 
## [4]  train-logloss:0.434053  test-logloss:0.440368 
## [5]  train-logloss:0.392603  test-logloss:0.399988 
## [6]  train-logloss:0.357633  test-logloss:0.366472 
## [7]  train-logloss:0.326262  test-logloss:0.335497 
## [8]  train-logloss:0.299307  test-logloss:0.309752 
## [9]  train-logloss:0.275127  test-logloss:0.286471 
## [10] train-logloss:0.254064  test-logloss:0.266038 
## [11] train-logloss:0.246028  test-logloss:0.258059 
## [12] train-logloss:0.228166  test-logloss:0.241110 
## [13] train-logloss:0.212535  test-logloss:0.226162 
## [14] train-logloss:0.199030  test-logloss:0.213673 
## [15] train-logloss:0.186369  test-logloss:0.202031 
## [16] train-logloss:0.175329  test-logloss:0.191756 
## [17] train-logloss:0.165163  test-logloss:0.182054 
## [18] train-logloss:0.155909  test-logloss:0.173148 
## [19] train-logloss:0.147937  test-logloss:0.165724 
## [20] train-logloss:0.140798  test-logloss:0.159106 
## [21] train-logloss:0.134419  test-logloss:0.153387 
## [22] train-logloss:0.128485  test-logloss:0.147681 
## [23] train-logloss:0.126779  test-logloss:0.146159 
## [24] train-logloss:0.121702  test-logloss:0.141489 
## [25] train-logloss:0.120300  test-logloss:0.140252 
## [26] train-logloss:0.115684  test-logloss:0.136209 
## [27] train-logloss:0.111563  test-logloss:0.132826 
## [28] train-logloss:0.107759  test-logloss:0.129496 
## [29] train-logloss:0.104358  test-logloss:0.126577 
## [30] train-logloss:0.101191  test-logloss:0.124088 
## [31] train-logloss:0.098429  test-logloss:0.121812 
## [32] train-logloss:0.096048  test-logloss:0.119849 
## [33] train-logloss:0.093736  test-logloss:0.118078 
## [34] train-logloss:0.091855  test-logloss:0.116222 
## [35] train-logloss:0.089978  test-logloss:0.114691 
## [36] train-logloss:0.088334  test-logloss:0.113315 
## [37] train-logloss:0.086624  test-logloss:0.111863 
## [38] train-logloss:0.085822  test-logloss:0.111504 
## [39] train-logloss:0.084222  test-logloss:0.110214 
## [40] train-logloss:0.083036  test-logloss:0.109247 
## [41] train-logloss:0.081971  test-logloss:0.108476 
## [42] train-logloss:0.080807  test-logloss:0.107580 
## [43] train-logloss:0.079924  test-logloss:0.106761 
## [44] train-logloss:0.078889  test-logloss:0.106185 
## [45] train-logloss:0.077890  test-logloss:0.105556 
## [46] train-logloss:0.077019  test-logloss:0.104972 
## [47] train-logloss:0.076692  test-logloss:0.104888 
## [48] train-logloss:0.076087  test-logloss:0.104560 
## [49] train-logloss:0.075228  test-logloss:0.103823 
## [50] train-logloss:0.074563  test-logloss:0.103500 
## [51] train-logloss:0.073904  test-logloss:0.103389 
## [52] train-logloss:0.073342  test-logloss:0.103047 
## [53] train-logloss:0.072766  test-logloss:0.102632 
## [54] train-logloss:0.072190  test-logloss:0.102346 
## [55] train-logloss:0.071809  test-logloss:0.102039 
## [56] train-logloss:0.071340  test-logloss:0.101941 
## [57] train-logloss:0.070922  test-logloss:0.101906 
## [58] train-logloss:0.070537  test-logloss:0.101871 
## [59] train-logloss:0.070109  test-logloss:0.101728 
## [60] train-logloss:0.069795  test-logloss:0.101409 
## [61] train-logloss:0.069435  test-logloss:0.101432 
## [62] train-logloss:0.069234  test-logloss:0.101196 
## [63] train-logloss:0.068961  test-logloss:0.100983 
## [64] train-logloss:0.068728  test-logloss:0.101066 
## [65] train-logloss:0.068271  test-logloss:0.101246 
## [66] train-logloss:0.067911  test-logloss:0.101329 
## [67] train-logloss:0.067658  test-logloss:0.101267 
## [68] train-logloss:0.067465  test-logloss:0.101118 
## [69] train-logloss:0.066696  test-logloss:0.101152 
## [70] train-logloss:0.066292  test-logloss:0.101252 
## [71] train-logloss:0.066050  test-logloss:0.101360 
## [72] train-logloss:0.065654  test-logloss:0.101406 
## [73] train-logloss:0.065511  test-logloss:0.101405 
## Stopping. Best iteration:
## [63] train-logloss:0.068961  test-logloss:0.100983

Model Evaluation

# 5. Model Evaluation ---------------------------------------------------

# Make predictions (predicted probabilities for class 1)
y_pred <- predict(xgb_model, dtest)
y_pred_class <- ifelse(y_pred > 0.5, 1, 0)  # threshold at 0.5

# Confusion Matrix — align factor levels so caret compares 0/1 consistently
y_pred_class <- factor(y_pred_class, levels = c(0, 1))
y_test <- factor(y_test, levels = c(0, 1))
conf_matrix <- confusionMatrix(y_pred_class, y_test)
print(conf_matrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1698   67
##          1   69 1418
##                                           
##                Accuracy : 0.9582          
##                  95% CI : (0.9507, 0.9648)
##     No Information Rate : 0.5434          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9157          
##                                           
##  Mcnemar's Test P-Value : 0.9317          
##                                           
##             Sensitivity : 0.9610          
##             Specificity : 0.9549          
##          Pos Pred Value : 0.9620          
##          Neg Pred Value : 0.9536          
##              Prevalence : 0.5434          
##          Detection Rate : 0.5221          
##    Detection Prevalence : 0.5427          
##       Balanced Accuracy : 0.9579          
##                                           
##        'Positive' Class : 0               
## 

Confusion Matrix: The model distinguishes the two classes well, misclassifying only 67 high-install apps as “Low Installs” and 69 low-install apps as “High Installs” out of 3,252 test cases. Precision and recall are correspondingly high for both classes.

# AUC and ROC Curve (labels converted back to numeric 0/1 for pROC)
roc_obj <- roc(as.numeric(as.character(y_test)), y_pred)
auc_value <- auc(roc_obj)
cat("AUC:", auc_value, "\n")
## AUC: 0.993891
# Plot ROC Curve
plot(roc_obj, main = "ROC Curve", col = "blue", lwd = 2)
abline(a = 0, b = 1, lty = 2, col = "red")  # chance-level diagonal

The ROC curve illustrates the model’s performance by plotting Sensitivity (True Positive Rate) against 1 - Specificity (False Positive Rate) across thresholds. The closer the curve approaches the top-left corner, the better the model distinguishes between “Low Installs” and “High Installs.” A high AUC value indicates strong classification performance.

Feature Importance

# Feature importance from the trained booster; plot the top 10 features
importance_matrix <- xgb.importance(feature_names = colnames(X_train), model = xgb_model)
xgb.plot.importance(importance_matrix, top_n = 10, main = "Feature Importance")

# 7. Save Model ---------------------------------------------------------

xgb.save(xgb_model, "xgb_app_success.model")
## [1] TRUE
# Summary of test-set performance
cat("Gradient Boosting achieved an accuracy of", conf_matrix$overall["Accuracy"],
    "and AUC of", auc_value, "\n")
## Gradient Boosting achieved an accuracy of 0.9581796 and AUC of 0.993891

“Reviews” has the highest influence, followed by “Rating” and “Last.Updated.” Features like “Price” and “Size” also contribute, albeit minimally. Other categorical variables, such as “catFAMILY” and “catMEDICAL,” have negligible impact on the model’s predictions.

SVM

# Separate predictors from the target, coercing the target to a factor
y <- as.factor(data_apps$Installs_Category)
X <- data_apps[, !(names(data_apps) %in% "Installs_Category")]

# Stratified 75/25 train/test split (caret keeps class proportions)
set.seed(42)
trainIndex <- createDataPartition(y, p = 0.75, list = FALSE)
X_train <- X[trainIndex, ]
y_train <- y[trainIndex]
X_test <- X[-trainIndex, ]
y_test <- y[-trainIndex]

Checking if the boundary is non-linear or linear

library(plotly)

# Prepare the plot data: training features plus the class label for coloring
plot_data <- data.frame(X_train, Class = as.factor(y_train))

# Create 3D scatter plot for features 1, 2, and 3
# (visual check of whether the two classes look linearly separable)
plot_1_2_3 <- plot_ly(data = plot_data, 
                      x = ~X_train[, 1], 
                      y = ~X_train[, 2], 
                      z = ~X_train[, 3], 
                      color = ~Class,
                      colors = c("red", "blue"),  # Set colors for classes (0 and 1)
                      type = 'scatter3d', 
                      mode = 'markers') %>%
  layout(title = "3D Scatter Plot: Feature 1 vs Feature 2 vs Feature 3",
         scene = list(xaxis = list(title = colnames(X_train)[1]),
                      yaxis = list(title = colnames(X_train)[2]),
                      zaxis = list(title = colnames(X_train)[3])))

# Create 3D scatter plot for features 4, 5, and 6
plot_4_5_6 <- plot_ly(data = plot_data, 
                      x = ~X_train[, 4], 
                      y = ~X_train[, 5], 
                      z = ~X_train[, 6], 
                      color = ~Class,
                      colors = c("red", "blue"),  # Set colors for classes (0 and 1)
                      type = 'scatter3d', 
                      mode = 'markers') %>%
  layout(title = "3D Scatter Plot: Feature 4 vs Feature 5 vs Feature 6",
         scene = list(xaxis = list(title = colnames(X_train)[4]),
                      yaxis = list(title = colnames(X_train)[5]),
                      zaxis = list(title = colnames(X_train)[6])))

# Show plots
plot_1_2_3
plot_4_5_6

As the 3D scatter plots do not make it clear whether the decision boundary is linear or non-linear, we fit both a linear and a non-linear (radial-kernel) SVM and compare which one is the better fit.

Tuning to find the best parameter values for C and Gamma for SVM non-linear

# Load necessary libraries
library(e1071)
library(caret)

# Assumes X_train, y_train, X_test, y_test are already defined above

# Combine the training data into a data frame
train_data <- as.data.frame(cbind(X_train, y_train))

# Set up 5-fold cross-validation
set.seed(42)
train_control <- trainControl(method = "cv", number = 5)

# Define the tuning grid for 'C' and 'sigma' (gamma)
tune_grid <- expand.grid(C = c( 0.1, 1, 10, 100),
                          sigma = c(0.5, 1))

# Train the SVM model using radial kernel with the tuning grid;
# caret selects the (C, sigma) pair with the best CV accuracy
svm_model <- train(y_train ~ ., data = train_data,
                   method = "svmRadial",
                   tuneGrid = tune_grid,
                   trControl = train_control,scaled = TRUE)

# Print the results of the tuning
print(svm_model)
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 8131 samples
##   39 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 6504, 6505, 6505, 6506, 6504 
## Resampling results across tuning parameters:
## 
##   C      sigma  Accuracy   Kappa    
##     0.1  0.5    0.7152887  0.4135606
##     0.1  1.0    0.7103702  0.3989988
##     1.0  0.5    0.7653439  0.5245875
##     1.0  1.0    0.7691565  0.5316017
##    10.0  0.5    0.7864964  0.5679965
##    10.0  1.0    0.7775180  0.5503263
##   100.0  0.5    0.7925222  0.5808094
##   100.0  1.0    0.7809624  0.5579036
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.5 and C = 100.
# Best model parameters chosen by cross-validation
best_params <- svm_model$bestTune
cat("Best Parameters:\n")
## Best Parameters:
print(best_params)
##   sigma   C
## 7   0.5 100

As seen for the training set the best accuracy is achieved when C = 100 and gamma is 0.5

Tuning to find the best parameter values for C for SVM linear

# Load necessary libraries
library(e1071)
library(caret)

# Assumes X_train, y_train, X_test, y_test are already defined above

# Combine the training data into a data frame
train_data <- as.data.frame(cbind(X_train, y_train))

# Set up 5-fold cross-validation
set.seed(42)
train_control <- trainControl(method = "cv", number = 5)

# Define the tuning grid for 'C' (the linear kernel has no sigma parameter)
tune_grid <- expand.grid(C = c( 0.1, 1, 10, 100, 1000))

# Train the SVM model using the linear kernel with the tuning grid
svm_model <- train(y_train ~ ., data = train_data,
                   method = "svmLinear",
                   tuneGrid = tune_grid,
                   trControl = train_control,scaled = TRUE)

# Print the results of the tuning
print(svm_model)
## Support Vector Machines with Linear Kernel 
## 
## 8131 samples
##   39 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 6504, 6505, 6505, 6506, 6504 
## Resampling results across tuning parameters:
## 
##   C      Accuracy   Kappa    
##   1e-01  0.7422242  0.4695042
##   1e+00  0.7841613  0.5544567
##   1e+01  0.8463913  0.6827210
##   1e+02  0.9097293  0.8154933
##   1e+03  0.9484710  0.8954868
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was C = 1000.
# Best model parameters chosen by cross-validation
best_params <- svm_model$bestTune
cat("Best Parameters:\n")
## Best Parameters:
print(best_params)
##      C
## 5 1000

For the linear model, at C = 1000 we attain an accuracy of about 94 percent, which suggests the classes are close to linearly separable. Hence, we now find the accuracy, ROC, and AUC score on the test data.

Bias–variance and MSE calculation

# Load necessary libraries
library(e1071)
library(ggplot2)

# Assumes X_train, y_train, X_test, y_test are already defined above

# Combine training and test sets into data frames
train_data <- as.data.frame(cbind(X_train, y_train))
test_data <- as.data.frame(X_test)

# Candidate cost values and preallocated storage for the test-set MSE of
# each (preallocating avoids the grow-with-c() anti-pattern)
C_values <- c(0.1, 1, 10, 100, 1000, 100000, 500000)
mse_values <- numeric(length(C_values))

# Loop through different values of C
for (i in seq_along(C_values)) {
  # Train the SVM model with a linear kernel at this cost
  svm_model <- svm(y_train ~ ., data = train_data, kernel = "linear", cost = C_values[i], scale = TRUE)
  
  # Predict on the test set
  predictions <- predict(svm_model, newdata = test_data)
  
  # Convert factor predictions and actual test labels to numeric 0/1
  predictions_numeric <- as.numeric(as.character(predictions))
  y_test_numeric <- as.numeric(as.character(y_test))
  
  # Calculate Mean Squared Error (MSE) on the held-out set
  mse_values[i] <- mean((y_test_numeric - predictions_numeric)^2)
}

# Combine results into a data frame for plotting
results <- data.frame(
  C = C_values,
  MSE = mse_values
)

# Plotting MSE values.
# NOTE: with a discrete x axis, geom_line() needs group = 1 — otherwise each
# factor level is its own group and no connecting line is drawn.
ggplot(results, aes(x = factor(C), y = MSE, group = 1)) +
  geom_line(color = "blue", size = 1) +
  geom_point(color = "red", size = 3) +
  labs(title = "MSE for Different C Values",
       x = "C Values",
       y = "Mean Squared Error (MSE)") +
  theme_minimal(base_size = 15) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold")
  ) +
  scale_x_discrete(labels = function(x) paste0("C = ", x)) +
  geom_text(aes(label = round(MSE, 2)), vjust = -0.5)  # Add value labels above points

The analysis of bias, variance, and mean squared error (MSE) across different values of C shows an MSE dip: the MSE reaches its minimum at C = 1000. The variance is slightly higher there, as expected for a large C, but not much higher than at C = 10 or C = 100.

Hence, better to select C = 1000 for the SVM model to achieve an optimal balance between bias, variance, and MSE.

Model Evaluation for Test Data

# Load necessary libraries
library(e1071)
library(pROC)  # For ROC and AUC

# Assumes X_train, y_train, X_test, y_test are already defined above

# Combine the training data into a data frame
train_data <- as.data.frame(cbind(X_train, y_train))

# Fit the final SVM with a linear kernel at the CV-selected cost C = 1000;
# decision.values = TRUE keeps the raw scores needed for the ROC curve
svm_model <- svm(y_train ~ ., data = train_data, kernel = "linear", cost = 1000, decision.values = TRUE,scaled = TRUE)

# Step 1: Make predictions on the test set
predictions <- predict(svm_model, newdata = as.data.frame(X_test))

# Step 2: Create confusion matrix
confusion_matrix <- table(Predicted = predictions, Actual = y_test)
cat("Confusion Matrix:\n")
## Confusion Matrix:
print(confusion_matrix)
##          Actual
## Predicted    0    1
##         0 1441  114
##         1   31 1123
confusion_matrix_caret <- confusionMatrix(confusion_matrix)

# Step 3: Extract Precision, Recall, and F1 Score
precision <- confusion_matrix_caret$byClass['Precision']
recall <- confusion_matrix_caret$byClass['Recall']
f1_score <- confusion_matrix_caret$byClass['F1']

# Display the metrics
cat("Precision:", precision, "\n")
## Precision: 0.9266881
cat("Recall:", recall, "\n")
## Recall: 0.9789402
cat("F1 Score:", f1_score, "\n")
## F1 Score: 0.9520978
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
cat("Accuracy:", accuracy, "\n")
## Accuracy: 0.9464747
# Step 4: Get decision values for ROC curve
fitted <- attributes(predict(svm_model, newdata = as.data.frame(X_test), decision.values = TRUE))$decision.values
 
# Step 5: Generate ROC plot for the test set
roc_curve <- roc(y_test, -fitted)  # Note: Use negative for class labeling
 
# Plot the ROC curve and annotate it with the AUC
plot(roc_curve, main = "ROC Curve for Test Data")
# Add AUC to the plot
auc_value <- auc(roc_curve)
legend("bottomright", legend = paste("AUC =", round(auc_value, 2)), bty = "n")

The model achieved an impressive accuracy of 94% and a high AUC of 0.99, indicating excellent performance in classification. Additionally, the high precision, recall, and F1-score values reflect the model’s capability to accurately classify both class 0 and class 1. These results suggest that the model is highly effective in distinguishing between the two classes, making it reliable for practical applications.

Feature importance for the model

# Assuming you have already trained your SVM model (svm_model) using e1071

# Get the primal weight vector from the SVM model as the weighted sum of
# support vectors (valid for a linear kernel only)
coefficients <- as.vector(svm_model$coefs) %*% svm_model$SV

# Get the intercept term
# NOTE(review): e1071 stores rho as the offset of the decision function —
# confirm the sign convention before interpreting it as an intercept
intercept <- svm_model$rho

# Combine coefficients and intercept into a single vector
all_coefficients <- c(intercept, coefficients)

# Print coefficients
cat("Coefficients (including intercept):\n")
## Coefficients (including intercept):
print(all_coefficients)
##  [1]  8.414745e+01  6.496813e-02 -5.874921e+02  3.056034e-02  1.983059e+01
##  [6]  1.000746e-01  2.364744e-01 -1.865601e-10  2.994796e-10  6.606222e-03
## [11]  2.135607e-10 -1.592937e-02  1.581999e-02  4.141880e-03 -2.317861e-02
## [16] -2.317393e-02 -1.273508e-02  4.583349e-02 -3.041952e-02  1.150636e-01
## [21]  4.433490e-02 -2.742884e-02  9.080968e-03 -7.041609e-03  3.055296e-02
## [26] -3.760284e-02  3.020458e-02  5.573072e-02  4.447850e-02 -4.367848e-10
## [31] -1.558534e-02 -6.564926e-03 -6.024834e-10 -3.561368e-02  1.181121e-03
## [36]  1.483217e-02 -4.697419e-02 -7.080146e-04  3.855689e-10 -8.795659e-03
# Check the number of coefficients (39 features + 1 intercept = 40)
num_coefficients <- length(all_coefficients)
cat("Number of Coefficients (including intercept):", num_coefficients, "\n")
## Number of Coefficients (including intercept): 40
# Get feature names
feature_names <- colnames(X_train)

# Create a named vector for coefficients with feature names
named_coefficients <- setNames(coefficients, feature_names)

# Print named coefficients
cat("Feature Coefficients:\n")
## Feature Coefficients:
print(named_coefficients)
##          Rating   Reviews       Size    Price Content.Rating Last.Updated
## [1,] 0.06496813 -587.4921 0.03056034 19.83059      0.1000746    0.2364744
##      catART_AND_DESIGN catAUTO_AND_VEHICLES   catBEAUTY catBOOKS_AND_REFERENCE
## [1,]     -1.865601e-10         2.994796e-10 0.006606222           2.135607e-10
##      catBUSINESS  catCOMICS catCOMMUNICATION   catDATING catEDUCATION
## [1,] -0.01592937 0.01581999       0.00414188 -0.02317861  -0.02317393
##      catENTERTAINMENT  catEVENTS   catFAMILY catFINANCE catFOOD_AND_DRINK
## [1,]      -0.01273508 0.04583349 -0.03041952  0.1150636         0.0443349
##          catGAME catHEALTH_AND_FITNESS catHOUSE_AND_HOME catLIBRARIES_AND_DEMO
## [1,] -0.02742884           0.009080968      -0.007041609            0.03055296
##      catLIFESTYLE catMAPS_AND_NAVIGATION catMEDICAL catNEWS_AND_MAGAZINES
## [1,]  -0.03760284             0.03020458 0.05573072             0.0444785
##       catPARENTING catPERSONALIZATION catPHOTOGRAPHY catPRODUCTIVITY
## [1,] -4.367848e-10        -0.01558534   -0.006564926   -6.024834e-10
##      catSHOPPING   catSOCIAL  catSPORTS    catTOOLS catTRAVEL_AND_LOCAL
## [1,] -0.03561368 0.001181121 0.01483217 -0.04697419       -0.0007080146
##      catVIDEO_PLAYERS   catWEATHER
## [1,]     3.855689e-10 -0.008795659
## attr(,"names")
##  [1] "Rating"                 "Reviews"                "Size"                  
##  [4] "Price"                  "Content.Rating"         "Last.Updated"          
##  [7] "catART_AND_DESIGN"      "catAUTO_AND_VEHICLES"   "catBEAUTY"             
## [10] "catBOOKS_AND_REFERENCE" "catBUSINESS"            "catCOMICS"             
## [13] "catCOMMUNICATION"       "catDATING"              "catEDUCATION"          
## [16] "catENTERTAINMENT"       "catEVENTS"              "catFAMILY"             
## [19] "catFINANCE"             "catFOOD_AND_DRINK"      "catGAME"               
## [22] "catHEALTH_AND_FITNESS"  "catHOUSE_AND_HOME"      "catLIBRARIES_AND_DEMO" 
## [25] "catLIFESTYLE"           "catMAPS_AND_NAVIGATION" "catMEDICAL"            
## [28] "catNEWS_AND_MAGAZINES"  "catPARENTING"           "catPERSONALIZATION"    
## [31] "catPHOTOGRAPHY"         "catPRODUCTIVITY"        "catSHOPPING"           
## [34] "catSOCIAL"              "catSPORTS"              "catTOOLS"              
## [37] "catTRAVEL_AND_LOCAL"    "catVIDEO_PLAYERS"       "catWEATHER"
# Sort coefficients by absolute value for feature importance
# (magnitude, not sign, is used as the importance proxy here)
sorted_coefficients <- sort(abs(named_coefficients), decreasing = TRUE)

# Print sorted feature importance
cat("Sorted Feature Importance:\n")
## Sorted Feature Importance:
print(sorted_coefficients)
##                Reviews                  Price           Last.Updated 
##           5.874921e+02           1.983059e+01           2.364744e-01 
##             catFINANCE         Content.Rating                 Rating 
##           1.150636e-01           1.000746e-01           6.496813e-02 
##             catMEDICAL               catTOOLS              catEVENTS 
##           5.573072e-02           4.697419e-02           4.583349e-02 
##  catNEWS_AND_MAGAZINES      catFOOD_AND_DRINK           catLIFESTYLE 
##           4.447850e-02           4.433490e-02           3.760284e-02 
##            catSHOPPING                   Size  catLIBRARIES_AND_DEMO 
##           3.561368e-02           3.056034e-02           3.055296e-02 
##              catFAMILY catMAPS_AND_NAVIGATION                catGAME 
##           3.041952e-02           3.020458e-02           2.742884e-02 
##              catDATING           catEDUCATION            catBUSINESS 
##           2.317861e-02           2.317393e-02           1.592937e-02 
##              catCOMICS     catPERSONALIZATION              catSPORTS 
##           1.581999e-02           1.558534e-02           1.483217e-02 
##       catENTERTAINMENT  catHEALTH_AND_FITNESS             catWEATHER 
##           1.273508e-02           9.080968e-03           8.795659e-03 
##      catHOUSE_AND_HOME              catBEAUTY         catPHOTOGRAPHY 
##           7.041609e-03           6.606222e-03           6.564926e-03 
##       catCOMMUNICATION              catSOCIAL    catTRAVEL_AND_LOCAL 
##           4.141880e-03           1.181121e-03           7.080146e-04 
##        catPRODUCTIVITY           catPARENTING       catVIDEO_PLAYERS 
##           6.024834e-10           4.367848e-10           3.855689e-10 
##   catAUTO_AND_VEHICLES catBOOKS_AND_REFERENCE      catART_AND_DESIGN 
##           2.994796e-10           2.135607e-10           1.865601e-10
# Keep the 15 most influential features (largest absolute coefficients).
top_coef <- head(sorted_coefficients, 15)

# Visualize feature importance. With horiz = TRUE the feature names sit on
# the y-axis and the coefficient magnitudes run along the x-axis, so the
# axis labels must reflect that (the original labeled the x-axis "Features").
# rev() draws the most important feature at the top of the chart, since
# barplot() places the first element at the bottom when horizontal.
barplot(
  rev(top_coef),
  main = "Feature Importance from SVM Coefficients",
  xlab = "Absolute Coefficient",
  ylab = "Features",
  col = "steelblue",
  las = 2,
  cex.names = 0.3, # shrink names so all 15 labels fit without overlap
  horiz = TRUE
)

The analysis shows that Reviews, Price, and Last.Updated are the top features influencing the model’s performance. Additionally, the leading app categories—Finance, Medical, Tools, Events, and News and Magazines—suggest that focusing on these areas can enhance the chances of app success.